/*******************************************************************************
																				
	DESCRIPTION:  	This do file runs regressions of LTU risk, duration dependence
					and cyclicality on observables in the baseline 2006 sample.

*******************************************************************************/

clear all
global id_code 133
pause on
set seed 2110

*******************************************************************************
* 1. Import data
*******************************************************************************
* Import Duration Dependence Data from 112_4:
use "${data}/119_3_DurationDependenceBetas_Full_2006.dta", clear

* Generate quintiles
xtile quintBeta1 = beta_1, nq(5)
xtile quintBeta0 = beta_0, nq(5)		
	
* Keep only certain variables for later:
frame change default
frames put LopNr_PersonNr InLnr p_emplAft6M_0M_In beta_0 beta_0_shrunk_ind quintBeta0 ///
	beta_1 beta_1_shrunk_ind quintBeta1, into(data1)
	

* Import cyclicality data from R
use "${data}/114_3_CyclicalityBetas_Full_2006.dta", clear

* Generate quantiles in terms of alfa and beta from the regression in logs
xtile quintBetaU = beta_u, nq(5)
xtile quintBeta0 = beta_0, nq(5)

xtile decBetaU = beta_u, nq(10)
xtile decBeta0 = beta_0, nq(10)
	
* Store desired variables into frame:
frames put LopNr_PersonNr InLnr p_emplAft6M_0M_In beta_0 beta_0_shrunk_ind quintBeta0 ///
	beta_u beta_u_shrunk_ind quintBetaU, into(data2)
	
* Import dataset with with characteristics
use "${data}/002_DataForR_Full_2006", clear

global controls $demoEdu $incIndiv $incOther $emplHist $incHist ///
	$migHist $indu $mun

keep LopNr_PersonNr InLnr $controls	

* Generate tertiary education dummy:
gen educ_tert = EducLevel_dummy5 + EducLevel_dummy6
gen educ_sec = EducLevel_dummy3 + EducLevel_dummy4
gen educ_prim = EducLevel_dummy1 + EducLevel_dummy2

label variable age "Age"
label variable Gender "Female"
label variable L_N_Kids "Kids"
label variable L_N_Kids_U18 "Kids <18"
label variable foreign "Foreign"
label variable educ_tert "Higher Edu."
label variable educ_sec "Secondary Edu."
label variable educ_prim "Primary Edu."
label variable L_WageInc_adj "Labour Income"
label variable L_FamInc_adj "Family Income"
label variable L_OtherInc_adj "Other Income"
label variable DaysUnemp_2Years "Days on UI (2y)"
label variable DaysOnDI_2Years "Days on DI (2y)"
label variable unemplSpells2Ybefore  "Unemp. Spells (2y)"
label variable tenure "Tenure"

* Link with the two other frames:
frame data1 { 
	frlink 1:1 LopNr_PersonNr InLnr, frame(default)
	frlink 1:1 LopNr_PersonNr InLnr, frame(data2)
}

frame data2 {
	frlink 1:1 LopNr_PersonNr InLnr, frame(default)
	frlink 1:1 LopNr_PersonNr InLnr, frame(data1)
}

* Store control variables in macros:
global controls_sel age Gender L_N_Kids L_N_Kids_U18 foreign educ_tert educ_sec ///
	L_WageInc_adj ///
	L_OtherInc_adj L_FamInc_adj ///
	DaysUnemp_2Years unemplSpells2Ybefore DaysOnDI_2Years tenure

* Pull controls from default data frame:
frame change data1 
frget $controls educ* , from(default)
frget beta_u_shrunk_ind quintBetaU, from(data2)

* Generate pi_2006 (predicted 6-M JFR at 0M in 2006):
gen pi_2006 = p_emplAft6M_0M_In

label variable pi_2006 "Pred. JFR"
label variable beta_1_shrunk_ind "Duration Dep. ({&beta}{sub:D})"
label variable beta_u_shrunk_ind "Cyclicality ({&beta}{sub:U})"

*******************************************************************************
* 2. Correlation heatmap
*******************************************************************************
frame change data1

* Obtain correlation matrix for a more restrictive set of covariates:
corr pi_2006 beta_1_shrunk_ind beta_u_shrunk_ind $controls_sel

matrix C = r(C)
matrix C = C[1..., 1..3]

heatplot C, values(size(vsmall) format(%9.2f)) cuts(-1(.1)1) colors(ebblue white orange_red, ipolate(20)) ///
	ramp(right subtitle("Correlation" "Coefficient") format(%9.2f) labels(-1 0 1) graphregion(color(white)) combine(graphregion(color(white)))) ///
	label ylabel(, nogrid labsize(vsmall) angle(0)) xlabel(, labsize(vsmall) angle(0)) graphregion(color(white))	
	
graph export "${output}/${id_code}_Visualizing_Heterogeneity_Correlates_Corr_Heatplot.pdf", as(pdf) replace 
	


*******************************************************************************
* 3. Linear regressions
*******************************************************************************
* First, standardize the variables:
egen std_pi_2006 = std(pi_2006)
egen std_beta_1_shrunk_ind = std(beta_1_shrunk_ind)
egen std_beta_u_shrunk_ind = std(beta_u_shrunk_ind)

label variable std_pi_2006 "Pred. JFR"
label variable std_beta_1_shrunk_ind "Duration Dep. ({&beta}{sub:D})"
label variable std_beta_u_shrunk_ind "Cyclicality ({&beta}{sub:U})"

global std_controls_sel 
foreach v of varlist $controls_sel {
	sum `v'
	if `=r(sd)' != 0 { 
		cap egen std_`v' = std(`v')
		global std_controls_sel $std_controls_sel std_`v'
	}
	else {
		global std_controls_sel $std_controls_sel `v'
	}
}

* Now regress on other variables and store the coefficient estimates:
reg std_pi_2006 $std_controls_sel
estimates store est_pi_2006

reg std_beta_1_shrunk_ind std_pi_2006 $std_controls_sel
estimates store est_beta_d

reg std_beta_u_shrunk_ind std_pi_2006 $std_controls_sel
estimates store est_beta_u

* Fix variable labels for the plots:
label variable std_age "Age"
label variable std_Gender "Female"
label variable std_L_N_Kids "Kids"
label variable std_L_N_Kids_U18 "Kids <18"
label variable std_foreign "Foreign"
label variable std_L_WageInc_adj "Labour Income"
label variable std_DaysUnemp_2Years "Days on UI (2y)"
label variable std_DaysOnDI_2Years "Days on DI (2y)"
label variable std_unemplSpells2Ybefore  "Unemp. Spells (2y)"
label variable std_educ_tert "Higher Edu."
label variable std_educ_sec "Secondary Edu."
label variable std_L_FamInc_adj "Household Income"
label variable std_L_OtherInc_adj "Other Income"
label variable std_tenure "Tenure"

	
* Plot coefficients using coefplot:
coefplot (est_pi_2006, label("Pred. JFR") color(gray) ciopts(color(gray))), ///
		bylabel("") || ///
	(est_beta_d, label("Duration Dep. ({&beta}{sub:D})") color(orange_red) ciopts(color(orange_red))) /// 
	(est_beta_u, label("Cyclicality ({&beta}{sub:U})") color(ebblue) ciopts(color(ebblue))), ///
		bylabel("")  ||, ///
	norecycle recast(bar) barwidth(0.4) cirecast(rcap) citop drop(_cons) ///
	orderby(2) ///
	coeflabels(, labsize(small)) ///
	groups(std_age std_educ_sec = `""{bf:Socio-}" "{bf:demographics}""' ///
		std_L_WageInc_adj std_L_FamInc_adj = `""{bf:Income}" "{bf:Variables}""' ///
		std_DaysUnemp_2Years std_tenure = `""{bf:Employment}" "{bf:History}""' ) ///
	fcolor(*0.5) xline(0) msize(2) ///
	xlabel(-.75(.25).5) ///
	xtitle("Regression Coefficient") ///
	legend(rows(1) size(small) symxsize(*0.6)) ///
	byopts(graphregion(color(white))) ///
	name(coef, replace)
	
graph export "${output}/${id_code}_Visualizing_Heterogeneity_Correlates_Reg_Coefplot.pdf", replace 


*******************************************************************************
* 4. Shapley-Owen decomposition
*******************************************************************************
* Specify whether you want to re-estimate the regressions (takes a while - you may
* want to do it on the batch server):
global reestimate 1

* Change data frame:
frame change data1 

* Next, we decompose the R2 using the Shapley value approach (i.e.,
* calculate all possible regressions and obtain average delta R2 when 
* introducing a group of variables):

* Set variable groups in the following macros:
local controls_pi_2006 ""$demoEdu" "$incIndiv" "$incOther" "$emplHist" "$incHist" "$migHist" "$indu" "$mun""

local controls_beta_1_shrunk_ind ""std_pi_2006" "$demoEdu" "$incIndiv" "$incOther" "$emplHist" "$incHist" "$migHist" "$indu" "$mun""
	
local controls_beta_u_shrunk_ind ""std_pi_2006" "$demoEdu" "$incIndiv" "$incOther" "$emplHist" "$incHist" "$migHist" "$indu" "$mun""
	
* Create frame:
cap frame drop part_r2
frame create part_r2 strL(vars) pi_2006 beta_1_shrunk_ind beta_u_shrunk_ind

* Loop over the three outcome variables:
qui foreach y in pi_2006 beta_1_shrunk_ind beta_u_shrunk_ind {

	cap frame drop r2_`y'
	frame create r2_`y' rsq strL(predictors)

	* Drop observations unless we observe all controls:
	keep if !missing(`controls_`y'')

	* Obtain 2^k unique sets of regressors:
	tuples `controls_`y''
		
	* Loop over the 2^k sets, regressing and saving the R2 to the new frame:
	if $reestimate == 1 {
		forval i = 0/`ntuples' {
			if mod(`i', 20) == 0 | `i' == 1 {
				noisily di "Estimating Regression `i' / `ntuples'"
			}
			regress std_`y' `tuple`i''
			

			frame post r2_`y' (`=e(r2)') ("`tuple`i''")
		}
		
		* Save results:
		 frame r2_`y': save "${output}/${id_code}_R2_`y'.dta", replace
	}
	
}


* Next, use stored results to calculate delta R2 and average over all possible
* specifications:
qui foreach y in pi_2006 beta_1_shrunk_ind beta_u_shrunk_ind {
	frame r2_`y': use "${output}/${id_code}_R2_`y'.dta", clear 
}


* Set number of groups:
local p_pi_2006 = 8
local p_beta_1_shrunk_ind = 9
local p_beta_u_shrunk_ind = 9

* Loop over the three outcome variables:
foreach y in pi_2006 beta_1_shrunk_ind beta_u_shrunk_ind {

	* Create new frame for delta R2 results:
	cap frame drop part_r2_`y'
	frame create part_r2_`y' strL(predictors) drsq
	
	local i = 1
	
	* Change the frame.
	frame change r2_`y'
	
	* Loop over our variable groups:
	foreach v in `controls_`y'' {

		* Generate indicator = 1 if the variable group is present:
		gen ind`i++' = strpos(predictors, "`v'") > 0
	}
	
	* Obtain number of groups in regression and calculate weigths for average:
	egen k = rowtotal(ind1-ind`p_`y'')
	gen w1 = exp(lnfactorial(k-1)) * exp(lnfactorial(`p_`y''-k)) / exp(lnfactorial(`p_`y''))
	gen w0 = exp(lnfactorial(k)) * exp(lnfactorial(`p_`y''-k-1)) / exp(lnfactorial(`p_`y''))
	
	* Generate weighted R2:
	gen rsq_w1 = rsq * w1
	gen rsq_w0 = rsq * w0
	
	* Loop through the regressors again:
	local i = 1
	foreach v in `controls_`y'' {

		* Average R2 without the group:
		sum rsq_w0 if ind`i' == 0
		local rsq_0 = r(sum)
		
		* Average R2 with the group:
		sum rsq_w1 if ind`i++' == 1
		local rsq_1 = r(sum)		
				
		* Delta R2:
		local drsq = `rsq_1' - `rsq_0'
	
		* Save to data frame:
		frame post part_r2_`y' ("`v'") (`drsq')
	}	

}

* Finally, merge the three decompositions:
foreach y in pi_2006 beta_1_shrunk_ind beta_u_shrunk_ind {
	frame part_r2_`y': rename drsq `y'	
}

* Link and merge with other datasets:

frame change part_r2_pi_2006

set obs `=_N+1'
replace predictors = "std_pi_2006" in `=_N'

frlink 1:1 predictors, frame(part_r2_beta_1_shrunk_ind)
frlink 1:1 predictors, frame(part_r2_beta_u_shrunk_ind)

frget beta_1_shrunk_ind, from(part_r2_beta_1_shrunk_ind)
frget beta_u_shrunk_ind, from(part_r2_beta_u_shrunk_ind)

* Clean up a bit:
order predictors pi_2006 beta_1_shrunk_ind beta_u_shrunk_ind

gen ord = _n + 1 in 1/8
replace ord = 1 in 9
sort ord

drop ord part_*

label variable pi_2006 "Pred. JFR"
label variable beta_1_shrunk_ind "Duration Dep. ({&beta}{sub:D})"
label variable beta_u_shrunk_ind "Cyclicality ({&beta}{sub:U})"

* Add row with totals:
set obs `=_N+1'
replace predictors = "TOTAL R2" in `=_N'

sum pi_2006
replace pi_2006 = r(sum) in `=_N'

sum beta_1_shrunk_ind
replace beta_1_shrunk_ind = r(sum) in `=_N'

sum beta_u_shrunk_ind
replace beta_u_shrunk_ind = r(sum) in `=_N'

* Transform into matrix:
mkmat pi_2006 beta_1_shrunk_ind beta_u_shrunk_ind, matrix(R)
matrix rownames R = "pi_2006" "Basic Socio-demographics" "Individual Income" "Other Income" "Employment History" "Income History" "Migration History" "Industry" "Municipality" "{bf} Total R{sup}2"
matrix list R
matrix A = R
matrix A[10, 1] = .
matrix A[10, 2] = .
matrix A[10, 3] = .

* Plot heatmap of the part R2:
heatplot A, values(size(vsmall) format(%9.3f)) cuts(0(.025).3) colors(white orange_red, ipolate(12)) ///
	text(10 1 "{bf}`: di %5.3f `= R[10, 1]''" 10 2 "{bf}`: di %5.3f `= R[10, 2]''" 10 3 "{bf}`: di %5.3f `= R[10, 3]''", placement(n) justification(left) margin(zero) size(vsmall)) ///
	ramp(right subtitle("Shapley" "Values") format(%9.2f) labels(0(0.1)0.3) graphregion(color(white)) combine(graphregion(color(white)))) ///
	label ylabel(, nogrid labsize(vsmall)) xlabel(, labsize(vsmall)) graphregion(color(white))

graph export "${output}/${id_code}_Visualizing_Heterogeneity_Correlates_PartR2_Heatplot.pdf", as(pdf) replace 
